


###
### IRT estimator
###
#source("../IRT_mixing/mixture_irt.R")
source("mixture_irt.R")

#load(file="all_results.RData") #reslist
#load(file="bigsurveys_recoded_plus_estimates.RData") #data
load(file="all_results4.RData") #reslist
data <- read.dta(file="bigsurveys_recoded4.dta")

policy_ind <- 3:318
years <- unique(data$source)[-5]

###
###  standardize estimated parameters
###

for(i in 1:length(reslist)){
    year <- years[i]
    print(years[i])
    
    sdx <- sd(reslist[[i]]$irt$x)
    meanx <- mean(reslist[[i]]$irt$x)
    reslist[[i]]$irt$a <- reslist[[i]]$irt$a +
           reslist[[i]]$irt$b*meanx
    reslist[[i]]$irt$b <- reslist[[i]]$irt$b*sdx
reslist[[i]]$irt$x <- (reslist[[i]]$irt$x-meanx)/sdx
}

###
### Evaluate dispersion in the discrimination parameters
###
for(i in 1:length(reslist)){
    year <- years[i]
    print(years[i])

    mat <- apply(data[data$source==year,policy_ind],2,as.numeric)
    exclude <- ! colnames(mat) %in% c("cces2012_immigration5","cces2012_immigration6")
    notna <- which(! is.na(colMeans(mat,na.rm=T)) & exclude)
    print(paste("items:",length(notna)))   
    mat <- mat[, notna]
    
    print(paste("max discrimination:",round(max(abs(reslist[[i]]$irt$b)),3)))
    ind <- which(abs(reslist[[i]]$irt$b) > 1/2*max(abs(reslist[[i]]$irt$b)))
    print(paste("items with > half discrimination of max:",length(ind)))
    print(colnames(mat)[ind])
    ind <- which(abs(reslist[[i]]$irt$b) > 1/3*max(abs(reslist[[i]]$irt$b)))
    print(paste("items with > 1/3 discrimination of max:",length(ind)))
    print(colnames(mat)[ind])
    print("--------------")
}

###
### What are the "real" number of items on each survey?
###

# looking at how many questions people actually answered
for(i in years){
    print(i)
    tmp <- data[data$source==i,policy_ind]
    notnas <- apply(tmp,1,function(x){length(which(! is.na(x)))})
    print(summary(notnas))
}

# in CCES 2014, how many people answered more than the median?
tmp <- data[data$source=="CCES_2014",policy_ind]
notnas <- apply(tmp,1,function(x){length(which(! is.na(x)))})
table(notnas > 32)
# it looks like most people were asked 32 questions and answered them

# now look at this by question to identify these 32 questions
notnas <- apply(tmp,2,function(x){length(which(! is.na(x)))})
summary(notnas)
table(notnas > 50000)
cces_2014_ind <- which(notnas > 50000)
cces_2014_names <- colnames(tmp)[cces_2014_ind]

mat <- apply(data[data$source=="CCES_2014",policy_ind],2,as.numeric)
exclude <- ! colnames(mat) %in% c("cces2012_immigration5","cces2012_immigration6")
notna <- which(! is.na(colMeans(mat,na.rm=T)) & exclude)
mat <- mat[, notna]
table(colnames(mat) %in% cces_2014_names)

mat <- mat[,which(colnames(mat) %in% cces_2014_names)]

#item_index <- colnames(mat) %in% cces_2014_names

system.time(res0 <- em_mix_irt(mat, iter=5))

# standardize estimates
sdx <- sd(res0$irt$x)
meanx <- mean(res0$irt$x)
res0$irt$a <- res0$irt$a + res0$irt$b*meanx
res0$irt$b <- res0$irt$b*sdx
res0$irt$x <- (res0$irt$x-meanx)/sdx

betas <- res0$irt$b
alphas <- res0$irt$a
xs <- res0$irt$x

n <- length(xs)
ntrials <- 3
Ms <- 10:length(betas)
trials <- rep(1:ntrials,length(Ms))
Ms <- rep(Ms,each=ntrials)
prop0 <- colMeans(res0$w)
#prop0 <- c(0.73805987,0.18086725,0.08107289)
est_props <- matrix(NA,3,length(Ms))
cor_w1 <- rep(NA,length(Ms))
cor_x <- rep(NA,length(Ms))


for(i in 1:length(Ms)){
   ptm <- proc.time()
   m <- Ms[i]
   trial <- trials[i]
   print(paste(m,trial,sep=","))
   ind <- sample(1:length(betas),m)
   X <- matrix(rep(xs,each=m),n,m,byrow=T)
   B <- matrix(rep(betas[ind],each=n),n,m)
   A <- matrix(rep(alphas[ind],each=n),n,m)
   C <- matrix(rep(res0$ivp$ivp[ind],each=n),n,m)
   W1 <- matrix(rep(res0$w[,1],each=m),n,m,byrow=T) 
   W2 <- matrix(rep(res0$w[,2],each=m),n,m,byrow=T) 
   W3 <- matrix(rep(res0$w[,3],each=m),n,m,byrow=T) 
   pmat <- pnorm(X*B - A)*W1 + C*W2 + .5*W3
   ymat <- matrix(rbinom(n*m,1,as.vector(pmat)),n,m)

   res <- em_mix_irt(ymat, iter=5)

   # initial estimated proportions in each group
   print(prop0)
   est_props[,i] <- colMeans(res$w)
   cor_w1[i] <- cor(res$w[,1],res0$w[,1])
   print(est_props[,i])
   print(cor_w1[i])

   # spatial voter correlation with the initial estimates
   xcor <- cor(res$irt$x,xs)
   res$irt$x <- sign(xcor)*res$irt$x
   cor_x[i] <- cor(res$irt$x, xs)
   print(cor_x[i])
   print(proc.time() - ptm)
   save("est_props","cor_w1","cor_x","Ms","prop0",file="simulation_output_test.RData")
}


pdf("simulation_cces.pdf")
ggplot(data=data.frame(cor_x=cor_x,M=Ms),
           aes(y=cor_x,x=M)) +
    geom_point() + geom_smooth()

ggplot(data=data.frame(cor_w1=cor_w1,M=Ms),
           aes(y=cor_w1,x=M)) +
    geom_point() + geom_smooth()

ggplot(data=data.frame(M=Ms,prop_spatial=est_props[1,],
                       prop_weighted=est_props[2,],
                       prop_random=est_props[3,])) +
    geom_point(aes(y=prop_spatial,x=M),color="red") +
    geom_point(aes(y=prop_weighted,x=M),color="blue") +
    geom_point(aes(y=prop_random,x=M),color="green") +
    geom_smooth(aes(y=prop_spatial,x=M),color="red") +
    geom_smooth(aes(y=prop_weighted,x=M),color="blue") +
    geom_smooth(aes(y=prop_random,x=M),color="green") +
    geom_hline(yintercept=prop0[1], linetype="dashed", color = "red") +
    geom_hline(yintercept=prop0[2], linetype="dashed", color = "blue") +
    geom_hline(yintercept=prop0[3], linetype="dashed", color = "green") 
dev.off()

